Face Recognition

Classification

In [1]:
import sys
sys.path.append('../utils/')
In [2]:
from ImageUtils import *

import numpy as np
import pandas as pd # Needs the package Pandas to be installed. Check Anaconda Environments and Packages.
from sklearn.decomposition import PCA # Needs SciKit Learn package to be installed. Check Anaconda Environments and Packages.4
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from mpl_toolkits import mplot3d

Data preparation

Load dataset

In [3]:
faces94_male = readFaces94MaleFaces(gray=True)
faces94_female = readFaces94FemaleFaces(gray=True)
faces94_malestaff = readFaces94MaleStaffFaces(gray=True)
landscapes = np.array(readLandsCapeImage(gray=True))

dataset = np.vstack((faces94_male, faces94_female, faces94_malestaff, landscapes))

labels = np.concatenate((
    np.ones(faces94_male.shape[0]),
    np.full(faces94_female.shape[0], 2),
#     np.full(faces94_malestaff.shape[0], 3),
    np.ones(faces94_malestaff.shape[0]),
    np.zeros(landscapes.shape[0])
))

dataset_N, height, width = dataset.shape

Data centralization and calculate of covariance matrix

In [4]:
dataset_norm = dataset/255
In [5]:
mean = np.mean(dataset_norm.reshape(dataset_N, height*width), axis=0).reshape(height, width)
In [6]:
dataset_norm_cov = np.cov(dataset_norm.reshape(dataset_N, height*width))
dataset_norm_cov.shape
Out[6]:
(3269, 3269)

Dimensionality reduction

Singular Value Decomposition

In [7]:
_,s,_ = np.linalg.svd(dataset_norm_cov)
In [8]:
representation_percentage = 0.85 # Selected variability 
In [9]:
sum_eig = np.sum(s)
percentage_variance = np.divide(s, sum_eig)
sum_var = 0
num_var = 0
for i in np.arange(percentage_variance.shape[0]):
    if sum_var >= representation_percentage:
        num_var = i
        break;
    
    sum_var += percentage_variance[i]
    
num_var
Out[9]:
62
In [10]:
cum_per=np.cumsum(percentage_variance)
for i in range(1,len(s)):
    change=(cum_per[i]-cum_per[i-1])/cum_per[i-1]*100
    if(change<.01):
        num_var1=i-1
        print("First",num_var1, "components with ",cum_per[num_var1]*100,"percent of variability captured and from which the contribution is less than 0.01%")
        break

plt.figure(figsize=(12,6))
plt.plot(cum_per*100)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('Cumulative Summation of the Explained Variance')
plt.show()
First 423 components with  97.32290673185808 percent of variability captured and from which the contribution is less than 0.01%

PCA Principal component analysis (Eigenfaces - Face space)

In [11]:
pca = PCA(n_components=num_var, svd_solver='full').fit(dataset.reshape(dataset_N, height*width))
pca.components_.shape
Out[11]:
(62, 36000)
In [12]:
cols = 4
rows = 4
plt.figure(figsize=(30,20))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.imshow(pca.components_[i].reshape(height, width), plt.cm.gray)

Labels classes

In [13]:
dataset_male = np.vstack((faces94_male, faces94_malestaff))
In [14]:
dataset_male.shape
Out[14]:
(2660, 200, 180)

within-class

In [15]:
mean_male = np.mean(dataset_male.reshape(dataset_male.shape[0], height*width)/255, axis=0).reshape(height, width)
mean_female = np.mean(faces94_female.reshape(faces94_female.shape[0], height*width)/255, axis=0).reshape(height, width)
mean_landscape = np.mean(landscapes.reshape(landscapes.shape[0], height*width)/255, axis=0).reshape(height, width)
In [16]:
fig = plt.figure(figsize=(10,6))

ax1 = fig.add_subplot(1,3,1)
plt.title("Mean Male")
ax1.imshow(mean_male*255, plt.cm.gray)

ax2 = fig.add_subplot(1,3,2)
plt.title("Mean Female")
ax2.imshow(mean_female*255, plt.cm.gray)

ax3 = fig.add_subplot(1,3,3)
plt.title("Mean Landscapes")
ax3.imshow(mean_landscape*255, plt.cm.gray)
Out[16]:
<matplotlib.image.AxesImage at 0x17c03b4c588>
In [17]:
male_cov = np.cov(np.subtract(dataset_male/255, mean_male).reshape(dataset_male.shape[0], height*width))
male_cov.shape
Out[17]:
(2660, 2660)
In [18]:
female_cov = np.cov(np.subtract(faces94_female/255, mean_female).reshape(faces94_female.shape[0], height*width))
female_cov.shape
Out[18]:
(399, 399)
In [19]:
landscape_cov = np.cov(np.subtract(landscapes/255, mean_landscape).reshape(landscapes.shape[0], height*width))
landscape_cov.shape
Out[19]:
(210, 210)
In [20]:
landscape_base_matrix = np.ones((landscapes.shape[0], height*width))
male_base_matrix = np.ones((dataset_male.shape[0], height*width))
female_base_matrix = np.ones((faces94_female.shape[0], height*width))

Projection images on face space

In [21]:
dataset_projected = pca.transform(dataset.reshape(dataset_N, height*width))
dataset_projected.shape
Out[21]:
(3269, 62)

Variance ratio PCA

In [22]:
pca.explained_variance_ratio_
Out[22]:
array([0.25795487, 0.1123311 , 0.06663033, 0.04262892, 0.03617492,
       0.03195197, 0.02660525, 0.02160968, 0.01746718, 0.01618394,
       0.01330534, 0.01235961, 0.01178076, 0.01034328, 0.00958855,
       0.00872421, 0.00787258, 0.00722736, 0.00715449, 0.00659753,
       0.00591367, 0.00578477, 0.00555298, 0.00538206, 0.00519405,
       0.00484171, 0.00462915, 0.00444058, 0.00421906, 0.00405604,
       0.00385099, 0.0036732 , 0.00363602, 0.0034641 , 0.00342827,
       0.00332731, 0.0031762 , 0.00301246, 0.00293804, 0.00281456,
       0.00273089, 0.00270494, 0.00254694, 0.00252305, 0.00240922,
       0.00237795, 0.00221538, 0.00218955, 0.00212681, 0.00207602,
       0.00199347, 0.00196167, 0.00191755, 0.00186262, 0.00183665,
       0.00179993, 0.00173835, 0.00170884, 0.00169296, 0.00164815,
       0.00161066, 0.00156615], dtype=float32)

Unsupervised image clustering - K means model

In [23]:
#k-means
kmeans = KMeans(n_clusters=3, random_state=42).fit(dataset_projected)
wcentroids=kmeans.cluster_centers_
wcentroids.shape
Out[23]:
(3, 62)
In [24]:
cols = 3
rows = 1
plt.figure(figsize=(10,6))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class "+str(i+1))
    plt.imshow((np.dot(kmeans.cluster_centers_[i],pca.components_)+mean.reshape(height*width)).reshape(height, width), plt.cm.gray)
In [25]:
y_label=kmeans.labels_
wtotaldist=kmeans.transform(dataset_projected)
wdistances = np.amin(wtotaldist, axis=1)
print(wdistances.shape[0])
3269

Class 1

In [26]:
#class 1 k-means
kclass=0
print("Number images: "+str(wdistances[y_label==kclass].shape[0]))
Number images: 1080
In [27]:
histbox(wdistances[y_label==kclass])
In [28]:
CVresult={'w distances':wdistances,'label':y_label}
df = pd.DataFrame(CVresult)
df.sort_values('w distances', axis = 0, ascending = True, inplace = True, na_position ='first')
df2=df.loc[df[df.columns[1]]==kclass]
df2.head(6)
Out[28]:
w distances label
3033 5406.113770 0
3034 5413.460449 0
3251 5425.819824 0
3024 5537.243164 0
3140 5603.247070 0
3038 5618.295410 0
In [29]:
#low distances class1
cols = 4
rows = 3
plt.figure(figsize=(24,16))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class1 low distance "+ str(df2['w distances'][df2.index[i]]),fontsize=13)
    plt.imshow(dataset[df2.index[i]], plt.cm.gray)
In [30]:
#High distances class1
cols = 4
rows = 1
plt.figure(figsize=(15,10))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class1 high distance "+ str(df2['w distances'][df2.index[-(i+1)]]),fontsize=10)
    plt.imshow(dataset[df2.index[-(i+1)]], plt.cm.gray)

Class 2

In [31]:
#class 2 k-means
kclass=1
print("Number images: "+str(wdistances[y_label==kclass].shape[0]))
Number images: 1173
In [32]:
histbox(wdistances[y_label==kclass])
In [33]:
CVresult={'w distances':wdistances,'label':y_label}
df = pd.DataFrame(CVresult)
df.sort_values('w distances', axis = 0, ascending = True, inplace = True, na_position ='first')
df2=df.loc[df[df.columns[1]]==kclass]
df2.head(6)
Out[33]:
w distances label
382 3566.893066 1
399 3590.540283 1
397 3593.110596 1
383 3626.149414 1
398 3655.169678 1
381 3659.193604 1
In [34]:
#low distances class2
cols = 4
rows = 3
plt.figure(figsize=(24,16))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class2 low distance "+ str(df2['w distances'][df2.index[i]]),fontsize=13)
    plt.imshow(dataset[df2.index[i]], plt.cm.gray)
In [35]:
#High distances class2
cols = 4
rows = 1
plt.figure(figsize=(15,10))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class2 high distance "+ str(df2['w distances'][df2.index[-(i+1)]]),fontsize=10)
    plt.imshow(dataset[df2.index[-(i+1)]], plt.cm.gray)

Class 3

In [36]:
#class 3 k-means
kclass=2
print("Number images: "+str(wdistances[y_label==kclass].shape[0]))
Number images: 1016
In [37]:
histbox(wdistances[y_label==kclass])
In [38]:
CVresult={'w distances':wdistances,'label':y_label}
df = pd.DataFrame(CVresult)
df.sort_values('w distances', axis = 0, ascending = True, inplace = True, na_position ='first')
df2=df.loc[df[df.columns[1]]==kclass]
df2.head(6)
Out[38]:
w distances label
1542 5184.216797 2
340 5209.929688 2
1546 5211.583496 2
1557 5220.653809 2
341 5222.695801 2
1548 5286.311523 2
In [39]:
#low distances class3
cols = 4
rows = 3
plt.figure(figsize=(24,16))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class3 low distance "+ str(df2['w distances'][df2.index[i]]),fontsize=13)
    plt.imshow(dataset[df2.index[i]], plt.cm.gray)
In [40]:
#High distances class3
cols = 4
rows = 1
plt.figure(figsize=(15,10))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class3 high distance "+ str(df2['w distances'][df2.index[-(i+1)]]),fontsize=10)
    plt.imshow(dataset[df2.index[-(i+1)]], plt.cm.gray)
In [41]:
labelsk = np.concatenate((
    np.full(faces94_male.shape[0],2),
    np.ones(faces94_female.shape[0]),
    np.full(faces94_malestaff.shape[0],2),
    np.zeros(landscapes.shape[0])
))
In [42]:
cm=confusion_matrix(labelsk, y_label).ravel()

plt.figure()
plt.title("Heatmap")
prediction_data = {'y_Actual': labelsk,'y_Predicted': y_label}
df = pd.DataFrame(prediction_data, columns=['y_Actual','y_Predicted'])
confusionmatrix1 = pd.crosstab(df['y_Actual'], df['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax=sns.heatmap(confusionmatrix1, annot=True,cmap='Blues', fmt='.0f');
ax.xaxis.set_ticklabels(['landscape', 'female', 'male']); ax.yaxis.set_ticklabels(['landscape', 'female', 'male']);
ax.invert_yaxis()
In [43]:
accuracy_score(y_true=labelsk, y_pred=y_label)
Out[43]:
0.40715815234016517
In [44]:
precision_score(y_true=labelsk, y_pred=y_label, average=None)
Out[44]:
array([0.11111111, 0.22335891, 0.93405512])
In [45]:
print(classification_report(y_true=labelsk, y_pred=y_label, target_names=["landscape", "woman", "man"]))
              precision    recall  f1-score   support

   landscape       0.11      0.57      0.19       210
       woman       0.22      0.66      0.33       399
         man       0.93      0.36      0.52      2660

   micro avg       0.41      0.41      0.41      3269
   macro avg       0.42      0.53      0.35      3269
weighted avg       0.79      0.41      0.47      3269

In [46]:
Y=kmeans.transform(dataset_projected)
plt.figure(figsize=(10,8))
ax = plt.axes(projection='3d')
ax.scatter(Y[np.where(y_label==0),0],Y[np.where(y_label==0),1] ,Y[np.where(y_label==0),2], cmap='viridis', linewidth=1);
ax.scatter(Y[np.where(y_label==1),0],Y[np.where(y_label==1),1] ,Y[np.where(y_label==1),2], cmap='viridis', linewidth=1);
ax.scatter(Y[np.where(y_label==2),0],Y[np.where(y_label==2),1] ,Y[np.where(y_label==2),2], cmap='viridis', linewidth=1);
plt.gca().legend(('class 1','class 2','class 3'))
Out[46]:
<matplotlib.legend.Legend at 0x17c041d65c0>

Supervised image classification - LDA model

Linear discriminant analysis

In [47]:
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(dataset_projected, labels)
Out[47]:
LinearDiscriminantAnalysis(n_components=2, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
In [48]:
dataset_lda = lda.transform(dataset_projected)
In [49]:
colors = ['navy', 'turquoise', 'darkorange']
classes = ['landscapes', 'male', 'female']
In [50]:
plt.figure(figsize=(10,8))
for color, i, class_name in zip(colors, np.arange(0, 3), classes):
    plt.scatter(dataset_lda[labels == i, 0], dataset_lda[labels == i, 1], alpha=.8, color=color,
                label=class_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.xlabel('LDA Eigenvector 1')
plt.ylabel('LDA Eigenvector 2')
plt.title('LDA of EigenFaces distribution')
Out[50]:
Text(0.5, 1.0, 'LDA of EigenFaces distribution')
In [51]:
lda.explained_variance_ratio_
Out[51]:
array([0.52496645, 0.47503355])

Dataset projected with LDA

In [52]:
lda.predict(dataset_projected)
Out[52]:
array([1., 1., 1., ..., 0., 1., 0.])

Class 1 - landscape

In [53]:
CVresult={'lda':dataset_lda[:,0],'label':lda.predict(dataset_projected)}
df = pd.DataFrame(CVresult)
df.sort_values('lda', axis = 0, ascending = True, inplace = True, na_position ='first')
df2=df.loc[df[df.columns[1]]==0]
df2.head()
Out[53]:
lda label
3063 -11.010850 0.0
3079 -9.442776 0.0
3200 -8.939532 0.0
3207 -8.479890 0.0
3214 -8.331429 0.0
In [54]:
cols = 4
rows = 3
plt.figure(figsize=(24,16))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class1 LDA "+ str(df2['lda'][df2.index[i]]),fontsize=13)
    plt.imshow(dataset[df2.index[i]], plt.cm.gray)

Class 2 - male

In [55]:
CVresult={'lda':dataset_lda[:,1],'label':lda.predict(dataset_projected)}
df = pd.DataFrame(CVresult)
df.sort_values('lda', axis = 0, ascending = True, inplace = True, na_position ='first')
df2=df.loc[df[df.columns[1]]==1]
df2.head()
Out[55]:
lda label
781 -3.033223 1.0
799 -2.865758 1.0
785 -2.765966 1.0
782 -2.765353 1.0
3209 -2.747305 1.0
In [56]:
cols = 4
rows = 3
plt.figure(figsize=(24,16))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class2 LDA "+ str(df2['lda'][df2.index[i]]),fontsize=13)
    plt.imshow(dataset[df2.index[i]], plt.cm.gray)

Class 3 - female

In [57]:
CVresult={'lda':dataset_lda[:,1],'label':lda.predict(dataset_projected)}
df = pd.DataFrame(CVresult)
df.sort_values('lda', axis = 0, ascending = False, inplace = True, na_position ='first')
df2=df.loc[df[df.columns[1]]==2]
df2.head()
Out[57]:
lda label
2306 5.703355 2.0
2310 5.669077 2.0
2302 5.660583 2.0
2307 5.641720 2.0
2303 5.634252 2.0
In [58]:
cols = 4
rows = 3
plt.figure(figsize=(24,16))
for i in np.arange(rows * cols):
    plt.subplot(rows, cols, i + 1)
    plt.title("Class3 LDA "+ str(df2['lda'][df2.index[i]]),fontsize=13)
    plt.imshow(dataset[df2.index[i]], plt.cm.gray)

TSNE (T-distributed stochastic neighbor embedding)

high-dimensional data for visualization

In [59]:
perplexities = [5, 30, 50, 100]
(fig, subplots) = plt.subplots(1, 4, figsize=(20, 8))
plt.axis('tight')

landscapes_class = 0
male_class = 1
female_class = 2

for i, perplexity in enumerate(perplexities):
    ax = subplots[i]

    tsne = TSNE(n_components=2, init='random',
                         random_state=0, perplexity=perplexity)
    dataset_tsne = tsne.fit_transform(dataset_projected)
    landscapes_idx = labels == landscapes_class
    male_idx = labels == male_class
    female_idx = labels == female_class
    
    ax.set_title("t-SNE Eigenfaces Perplexity=%d" % perplexity)
    
    ax.scatter(dataset_tsne[landscapes_idx, 0], dataset_tsne[landscapes_idx, 1], c=colors[landscapes_class])
    ax.scatter(dataset_tsne[male_idx, 0], dataset_tsne[male_idx, 1], c=colors[male_class])
    ax.scatter(dataset_tsne[female_idx, 0], dataset_tsne[female_idx, 1], c=colors[female_class])
    
    ax.xaxis.set_major_formatter(NullFormatter())
    ax.yaxis.set_major_formatter(NullFormatter())
    ax.axis('tight')

train test supervised models low-dimensional PCA (Logistic Regression - LDA)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(dataset_projected, labels, test_size=0.3, stratify=labels)

Supervised image classification - Logistic Regression

In [61]:
classifier = LogisticRegression(solver='newton-cg', multi_class='multinomial')
classifier.fit(X_train, y_train)
C:\Users\jetor\Anaconda3\lib\site-packages\sklearn\utils\optimize.py:203: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations.
  "number of iterations.", ConvergenceWarning)
Out[61]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
In [62]:
y_test_pred = classifier.predict(X_test)
In [63]:
accuracy_score(y_true=y_test, y_pred=y_test_pred)
Out[63]:
0.9745158002038736
In [64]:
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)
Out[64]:
array([0.9       , 0.98740554, 0.92913386])
In [65]:
print(classification_report(y_true=y_test, y_pred=y_test_pred, target_names=["landscape", "man", "woman"]))
              precision    recall  f1-score   support

   landscape       0.90      0.86      0.88        63
         man       0.99      0.98      0.98       798
       woman       0.93      0.98      0.96       120

   micro avg       0.97      0.97      0.97       981
   macro avg       0.94      0.94      0.94       981
weighted avg       0.97      0.97      0.97       981

In [66]:
plt.figure()
plt.title("Heatmap")
classes_dict = {'Actual': y_test.tolist(), 'Predicted': y_test_pred.tolist()}
classes_df = pd.DataFrame(classes_dict, columns=["Actual", "Predicted"])
conf_matrix = pd.crosstab(classes_df['Actual'], classes_df['Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax=sns.heatmap(conf_matrix, annot=True,cmap='Blues', fmt='.0f');
ax.xaxis.set_ticklabels(['landscape', 'male', 'female']); ax.yaxis.set_ticklabels(['landscape', 'male', 'female']);
ax.invert_yaxis()

Supervised image classification - LDA

In [67]:
classifier_lda = LinearDiscriminantAnalysis(n_components=2)
classifier_lda.fit(X_train, y_train)
Out[67]:
LinearDiscriminantAnalysis(n_components=2, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)
In [68]:
y_test_pred = classifier_lda.predict(X_test)
In [69]:
accuracy_score(y_true=y_test, y_pred=y_test_pred)
Out[69]:
0.9235474006116208
In [70]:
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)
Out[70]:
array([0.88679245, 0.93932039, 0.81730769])
In [71]:
print(classification_report(y_true=y_test, y_pred=y_test_pred, target_names=["landscape", "man", "woman"]))
              precision    recall  f1-score   support

   landscape       0.89      0.75      0.81        63
         man       0.94      0.97      0.95       798
       woman       0.82      0.71      0.76       120

   micro avg       0.92      0.92      0.92       981
   macro avg       0.88      0.81      0.84       981
weighted avg       0.92      0.92      0.92       981

In [72]:
plt.figure()
plt.title("Heatmap")
classes_dict = {'Actual': y_test.tolist(), 'Predicted': y_test_pred.tolist()}
classes_df = pd.DataFrame(classes_dict, columns=["Actual", "Predicted"])
conf_matrix = pd.crosstab(classes_df['Actual'], classes_df['Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax=sns.heatmap(conf_matrix, annot=True,cmap='Blues', fmt='.0f');
ax.xaxis.set_ticklabels(['landscape', 'male', 'female']); ax.yaxis.set_ticklabels(['landscape', 'male', 'female']);
ax.invert_yaxis()

train test supervised models low-dimensional LDA (Logistic Regression - LDA)

In [73]:
X_train, X_test, y_train, y_test = train_test_split(dataset_lda, labels, test_size=0.3, stratify=labels)

Supervised image classification - Logistic Regression

In [74]:
classifier = LogisticRegression(solver='newton-cg', multi_class='multinomial')
classifier.fit(X_train, y_train)
Out[74]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
In [75]:
y_test_pred = classifier.predict(X_test)
In [76]:
accuracy_score(y_true=y_test, y_pred=y_test_pred)
Out[76]:
0.9429153924566769
In [77]:
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)
Out[77]:
array([0.94444444, 0.95577396, 0.84955752])
In [78]:
print(classification_report(y_true=y_test, y_pred=y_test_pred, target_names=["landscape", "man", "woman"]))
              precision    recall  f1-score   support

   landscape       0.94      0.81      0.87        63
         man       0.96      0.97      0.97       798
       woman       0.85      0.80      0.82       120

   micro avg       0.94      0.94      0.94       981
   macro avg       0.92      0.86      0.89       981
weighted avg       0.94      0.94      0.94       981

In [79]:
plt.figure()
plt.title("Heatmap")
classes_dict = {'Actual': y_test.tolist(), 'Predicted': y_test_pred.tolist()}
classes_df = pd.DataFrame(classes_dict, columns=["Actual", "Predicted"])
conf_matrix = pd.crosstab(classes_df['Actual'], classes_df['Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax=sns.heatmap(conf_matrix, annot=True,cmap='Blues', fmt='.0f');
ax.xaxis.set_ticklabels(['landscape', 'male', 'female']); ax.yaxis.set_ticklabels(['landscape', 'male', 'female']);
ax.invert_yaxis()

train test supervised models low-dimensional TSNE (Logistic Regression)

In [80]:
tsne = TSNE(n_components=2, init='random',
                     random_state=0, perplexity=80)
dataset_tsne = tsne.fit_transform(dataset_projected)
In [81]:
dataset_tsne.shape
Out[81]:
(3269, 2)
In [82]:
X_train, X_test, y_train, y_test = train_test_split(dataset_tsne, labels, test_size=0.3, stratify=labels)
In [83]:
classifier = LogisticRegression(solver='newton-cg', multi_class='multinomial')
classifier.fit(X_train, y_train)
C:\Users\jetor\Anaconda3\lib\site-packages\scipy\optimize\linesearch.py:313: LineSearchWarning: The line search algorithm did not converge
  warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\jetor\Anaconda3\lib\site-packages\sklearn\utils\optimize.py:195: UserWarning: Line Search failed
  warnings.warn('Line Search failed')
Out[83]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)
In [84]:
y_test_pred = classifier.predict(X_test)
In [85]:
accuracy_score(y_true=y_test, y_pred=y_test_pred)
Out[85]:
0.8491335372069317
In [86]:
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)
C:\Users\jetor\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1143: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Out[86]:
array([0.        , 0.86516854, 0.69230769])
In [87]:
print(classification_report(y_true=y_test, y_pred=y_test_pred, target_names=["landscape", "man", "woman"]))
              precision    recall  f1-score   support

   landscape       0.00      0.00      0.00        63
         man       0.87      0.96      0.91       798
       woman       0.69      0.53      0.60       120

   micro avg       0.85      0.85      0.85       981
   macro avg       0.52      0.50      0.50       981
weighted avg       0.79      0.85      0.82       981

C:\Users\jetor\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
C:\Users\jetor\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
C:\Users\jetor\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
In [88]:
plt.figure()
plt.title("Heatmap")
classes_dict = {'Actual': y_test.tolist(), 'Predicted': y_test_pred.tolist()}
classes_df = pd.DataFrame(classes_dict, columns=["Actual", "Predicted"])
conf_matrix = pd.crosstab(classes_df['Actual'], classes_df['Predicted'], rownames=['Actual'], colnames=['Predicted'])
ax=sns.heatmap(conf_matrix, annot=True,cmap='Blues', fmt='.0f');
ax.xaxis.set_ticklabels(['male', 'female']); ax.yaxis.set_ticklabels(['landscape', 'male', 'female']);
ax.invert_yaxis()

Aglomerative Clustering Dendogram

In [89]:
linkage_matrix = linkage(y=dataset_lda, method='weighted')
In [90]:
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    linkage_matrix,
    p=3,truncate_mode='level'
)
plt.show()
In [ ]: